'''
* This is the project for Brtc LlmOps Platform
* @Author Leon-liao <liaosiliang@alltman.com>
* @Description Load ./data.txt and split it into semantic chunks with SemanticChunker and OpenAI embeddings
* @File: 14_use_llm_splitter.py
* @Time: 2025/10/29
* All Rights Reserved By Brtc
'''
import dotenv
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import TextSplitter

# Load variables from a .env file before any client object is created
# (presumably this is where the OpenAI credentials come from -- confirm).
dotenv.load_dotenv()

# 1. Build the document loader and the semantic text splitter.
loader = UnstructuredFileLoader("./data.txt")

# Embedding model handed to the splitter.
embedding_model_name = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=embedding_model_name)

# Zero-width split point right after a Chinese full stop, question mark or
# exclamation mark, so the punctuation stays attached to its sentence.
sentence_boundary_regex = r"(?<=[。？！])"

text_splitter = SemanticChunker(
    embeddings=embeddings,
    sentence_split_regex=sentence_boundary_regex,
    # Requested chunk count; see the SemanticChunker docs for how this is
    # turned into a breakpoint threshold.
    number_of_chunks=20,
    # Presumably records each chunk's start offset in its metadata -- confirm
    # against the installed langchain_experimental version.
    add_start_index=True,
)

# 2. Load the source file, split it, and report each chunk's size and metadata.
documents = loader.load()
chunks = text_splitter.split_documents(documents)
for chunk in chunks:
    print(f"块大小：{len(chunk.page_content)}, 元数据：{chunk.metadata}")
